This document contains plots and statistical analyses for Homm et al. “Can Untargeted RNA-Seq of Urine Samples Diagnose UTIs in Children?”.
library(DESeq2)
library(ggplot2)
library(biomaRt)
library(EnhancedVolcano)
library(factoextra)
library(org.Hs.eg.db)
library(ensembldb)
library(tximport)
library(AnnotationHub)
library(clusterProfiler)
library(AnnotationDbi)
library(ggvenn)
library(ComplexHeatmap)
library(reshape2)
library(dplyr)
library(tidyr)
library(readxl)
library(pROC)
library(circlize)
library(ggpubr)
library(effsize)
library(mclust)
library(classInt)
library(patchwork)
library(grid)
library(eulerr)
library(devtools)
library(xCell)
library(circlize)
library(plotly)
# List of your packages
pkgs <- c(
"DESeq2", "ggplot2", "biomaRt", "EnhancedVolcano", "factoextra", "org.Hs.eg.db",
"ensembldb", "tximport", "AnnotationHub", "clusterProfiler", "AnnotationDbi",
"ggvenn", "ComplexHeatmap", "reshape2", "dplyr", "tidyr", "readxl", "pROC",
"circlize", "ggpubr", "effsize", "mclust", "classInt", "patchwork", "grid",
"eulerr", "devtools", "xCell", "plotly"
)
# function to get version
get_version <- function(pkg) {
if (requireNamespace(pkg, quietly = TRUE)) {
as.character(packageVersion(pkg))
} else {
"Not Installed"
}
}
# create named vector of versions
pkg_versions <- sapply(pkgs, get_version)
# print nicely
versions_df <- data.frame(Package = names(pkg_versions), Version = pkg_versions)
print(versions_df, row.names = FALSE)
## Package Version
## DESeq2 1.44.0
## ggplot2 3.5.1
## biomaRt 2.60.1
## EnhancedVolcano 1.22.0
## factoextra 1.0.7
## org.Hs.eg.db 3.19.1
## ensembldb 2.28.1
## tximport 1.32.0
## AnnotationHub 3.12.0
## clusterProfiler 4.12.6
## AnnotationDbi 1.66.0
## ggvenn 0.1.10
## ComplexHeatmap 2.20.0
## reshape2 1.4.4
## dplyr 1.1.4
## tidyr 1.3.1
## readxl 1.4.3
## pROC 1.18.5
## circlize 0.4.16
## ggpubr 0.6.0
## effsize 0.8.1
## mclust 6.1.1
## classInt 0.4.11
## patchwork 1.3.0
## grid 4.4.2
## eulerr 7.0.2
## devtools 2.4.5
## xCell 1.1.0
## plotly 4.10.4
#' Contains RPM data for all species, along with corresponding metadata
load("../data/reads.meta.counts.239.RData")
#' Contains txi object with all Salmon-mapped human gene expression levels
load("../data/txi.data.239.RData")
#' Contains just RPM data for all species
load("../data/rpmdata.239.RData")
# ID patients that express over 5000 human genes
human.gene.counts <- apply(txi.data.239$counts, 2, function(x) sum(x != 0))
human.gene.over.thres <- subset(human.gene.counts, human.gene.counts > 5000)
human.gene.over.thres <- human.gene.over.thres[na.omit(names(human.gene.over.thres))]
# Subset txi.data to only include these patients
txi.data.239$abundance <- subset(txi.data.239$abundance, select = c(names(human.gene.over.thres)))
txi.data.239$counts <- subset(txi.data.239$counts, select = c(names(human.gene.over.thres)))
txi.data.239$length <- subset(txi.data.239$length, select = c(names(human.gene.over.thres)))
# Calculate the mean counts of each gene
gene_means <- rowMeans(txi.data.239$counts)
# Filter out genes with mean counts less than 1
txi.data.239$counts <- txi.data.239$counts[gene_means > 1, ]
txi.data.239$abundance <- txi.data.239$abundance[gene_means > 1, ]
txi.data.239$length <- txi.data.239$length[gene_means > 1, ]
print(dim(txi.data.239$abundance))
## [1] 41617 214
col_data <- data.frame(row.names = colnames(txi.data.239$counts), condition = factor(rep(1, ncol(txi.data.239$counts))))
col_data <- cbind(col_data, reads.meta.counts.239[human.gene.over.thres , ])
dds.unsupervised <- DESeqDataSetFromTximport(txi.data.239, colData = col_data, design = ~1)
dds.unsupervised <- DESeq(dds.unsupervised)
# Initialize df's to be used in downstream analysis
reads.meta.counts <- reads.meta.counts.239[names(human.gene.over.thres) , ]
rpmdata <- rpmdata.239[names(human.gene.over.thres) , ]
txi.data <- txi.data.239
# Obtain reads data
reads <- reads.meta.counts$total.reads
human.reads <- reads.meta.counts$total.reads * (rpmdata$`Homo sapiens` / 1000000)
data <- data.frame(
patient = 1:length(human.reads),
non_human_reads = reads - human.reads,
human_reads = human.reads
)
# Compute proportion of human reads
data$human_read_proportion <- data$human_reads / reads
# Order patients by human read proportion
data <- data %>% arrange(human_read_proportion)
# Convert to long format for stacked bar plot
long_data <- data %>%
pivot_longer(cols = c(human_reads, non_human_reads), names_to = "Read Type", values_to = "value")
# Ensure "human_reads" is the first level so it appears at the bottom
long_data$`Read Type` <- factor(long_data$`Read Type`, levels = c("human_reads", "non_human_reads"))
# Create the stacked bar plot
stacked_bar <- ggplot(long_data, aes(x = factor(patient, levels = data$patient), y = value, fill = `Read Type`)) +
geom_col(position = "stack") +
scale_fill_manual(values = c("human_reads" = "blue", "non_human_reads" = "red")) +
labs(x = "214 Patients", y = "Number of Reads", fill = "Read Type") +
theme_minimal() +
theme(
axis.text.x = element_blank(),
panel.grid = element_blank(),
panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
legend.text = element_text(size = 15),
legend.title = element_text(size = 15)
)
# Create a thin heatmap at the bottom
heatmap <- ggplot(data, aes(x = factor(patient, levels = data$patient), y = 1, fill = human_read_proportion)) +
geom_tile() +
scale_fill_gradient(low = "grey", high = "darkgreen",
limits = c(0, 1),
breaks = c(0, 0.25, 0.5, 0.75, 1)) +
labs(fill = "Human Read Proportion") +
theme_void() +
theme(
legend.position = "bottom",
legend.text = element_text(size = 12),
legend.title = element_text(size = 12),
legend.key.height = unit(0.5, "cm"),
legend.key.width = unit(1.5, "cm"),
aspect.ratio = 0.05,
legend.title.align = -1,
panel.border = element_rect(color = "black", fill = NA, linewidth = 1)
)
# Combine both plots using patchwork with adjusted heights
final_plot <- stacked_bar / heatmap + plot_layout(heights = c(4, 1))
print(final_plot)
normalized_counts <- counts(dds.unsupervised, normalized = TRUE)
vsd <- vst(dds.unsupervised, blind = TRUE)
pca_data <- plotPCA(vsd, returnData = TRUE)
pca_df <- data.frame(
PC1 = pca_data$PC1,
PC2 = pca_data$PC2,
Sample = rownames(pca_data)
)
# mimic plotPCA internals to get more PCs
ntop <- 500
rv <- rowVars(assay(vsd))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
pca <- prcomp(t(assay(vsd)[select, ]))
# make data frame for plotting
pca_df_3d <- data.frame(
PC1 = pca$x[,1],
PC2 = pca$x[,2],
PC3 = pca$x[,3]
)
# interactive 3D plot
plot_ly(
data = pca_df_3d,
x = ~PC1, y = ~PC2, z = ~PC3,
type = "scatter3d",
mode = "markers"
)
# Extract the variance of each
summary(pca)$importance[2, 1]
## [1] 0.69163
summary(pca)$importance[2, 2]
## [1] 0.03727
summary(pca)$importance[2, 3]
## [1] 0.03002
set.seed(1)
# Run k-means clustering, k = 2
kmeans_result_pca <- kmeans(pca_df[ , c("PC1", "PC2")], centers = 2, nstart = 25)
# Add cluster assignments to your data frame
pca_df$cluster <- as.factor(kmeans_result_pca$cluster)
cluster_colors <- c("red", "blue")
# Create the plot comparing PC1 and PC2
k_means_plot_pca <- ggplot(pca_df, aes(x = PC1, y = PC2, color = factor(cluster))) +
geom_point(size = 3, alpha = 0.8) +
stat_ellipse(aes(fill = factor(cluster)), geom = "polygon",
alpha = 0.3,
color = "black", # outline to make visible
level = 0.95,
show.legend = TRUE) +
labs(
title = "PCA Plot with K-means Clusters (k = 2)",
x = "PC1",
y = "PC2",
color = "Cluster",
fill = "Cluster"
) +
theme_minimal(base_size = 16) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 18),
axis.title = element_text(face = "bold"),
legend.position = "right",
legend.text = element_text(size = 14),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(color = "black", fill = NA, size = 1.5)
) +
scale_color_manual(values = cluster_colors) +
scale_fill_manual(values = cluster_colors)
# Display the plot
k_means_plot_pca
# Perform k-means clustering using 3 PCs
kmeans_result_pca_3d <- kmeans(pca_df_3d[, c("PC1", "PC2", "PC3")], centers = 2, nstart = 25)
# Add cluster assignments to your data frame
pca_df_3d$cluster <- as.factor(kmeans_result_pca_3d$cluster)
# Set color palette
cluster_colours <- c("1" = "red", "2" = "blue")
# 3D PCA plot with k-means clusters
k_means_plot_3d <- plot_ly(
data = pca_df_3d,
x = ~PC1,
y = ~PC2,
z = ~PC3,
color = ~cluster,
colors = cluster_colours,
type = "scatter3d",
mode = "markers",
marker = list(size = 5, opacity = 0.8)
) %>%
layout(
title = list(text = "3D PCA Plot with K-means Clusters (k = 2)", x = 0.5),
scene = list(
xaxis = list(title = "PC1"),
yaxis = list(title = "PC2"),
zaxis = list(title = "PC3")
)
)
# Display the plot
k_means_plot_3d
# Re-name the clusters to 0/1
patients.over.thres <- names(human.gene.over.thres)
k_groups <- kmeans_result_pca$cluster
k_groups <- ifelse(k_groups == 2, 0, 1)
names(k_groups) <- patients.over.thres
# Create DESeq metadata
k_groups_metadata <- data.frame(group = k_groups,
group_word = sapply(k_groups, function(x) if (x == 1) {return("uti")} else {return("no_uti")}))
# Run DESeq on cluster 1 vs. cluster 2, using the original txi.data
dds.kmeans <- DESeqDataSetFromTximport(txi.data, colData = k_groups_metadata, design = ~group_word)
# Brief moment to run vst again
vst.for.heatmap <- varianceStabilizingTransformation(dds.kmeans, blind = TRUE)
# Run DESeq2
dds.kmeans <- DESeq(dds.kmeans)
# Isolate the volcano info
res.human <- results(dds.kmeans, contrast = c("group_word", "uti", "no_uti"))
# Map ensembl ids to their actual gene names
res.df.human <- as.data.frame(res.human)
res.df.human$symbol <- mapIds(org.Hs.eg.db, keys = rownames(res.df.human), keytype = "ENSEMBL", column = "SYMBOL")
genes_to_label <- c("LIMK2", "CSF3R", "BCL6", "NFKB1", "SELL", "JAK3")
# Create the volcano
volc <- EnhancedVolcano(res.df.human,
x = "log2FoldChange",
y = "padj",
lab = res.df.human$symbol,
selectLab = genes_to_label,
drawConnectors = TRUE,
pointSize = 3.0,
title = "DESeq2 with K-means Groups",
subtitle = "",
FCcutoff = 1,
pCutoff = 0.05)
volc
enrichment <- function(res.df, padj_thres, log2fc_thres, up.or.down, save = FALSE) {
if (up.or.down == "up") {
sig.genes <- rownames(res.df[res.df$log2FoldChange > log2fc_thres & res.df$padj < padj_thres, ])
sig.genes <- sig.genes[!is.na(sig.genes) & !grepl("^NA", sig.genes)]
title = "Upregulated Pathways"
} else {
sig.genes <- rownames(res.df[res.df$log2FoldChange < -log2fc_thres & res.df$padj < padj_thres, ])
sig.genes <- sig.genes[!is.na(sig.genes) & !grepl("^NA", sig.genes)]
title = "Downregulated Pathways"
}
# Run GO analysis
GO_results <- enrichGO(gene = sig.genes, OrgDb = "org.Hs.eg.db", keyType = "ENSEMBL", ont = "BP")
GO_results_df <- as.data.frame(GO_results)
# Print p-values for debugging
print("Original p.adjust values:")
print(head(GO_results_df$p.adjust))
# Create plot using the enrichResult object (GO_results)
fit.up <- barplot(GO_results, showCategory = 100) +
xlab("Number of Genes") +
ggtitle(title) +
theme(
plot.title = element_text(size = 15, face = "bold", hjust = 0.5),
axis.text.y = element_text(size = 12)
) +
scale_fill_gradient(
low = "#e73512",
high = "#ee794d",
name = "Adjusted p-value",
trans = "log10", # Log transform for small p-values
labels = scales::scientific_format() # Force scientific notation
)
if (save) {
ggsave(
filename = paste0(up.or.down, ".pdf"),
plot = fit.up,
device = "pdf",
width = 12,
height = 10
)
}
print(paste("Number of significant genes:", length(sig.genes)))
return(fit.up)
}
# Run enrichment function on up/down-regulated genes
upreg.plot <- enrichment(res.df.human, 0.05, 1, "up", save = FALSE)
## [1] "Original p.adjust values:"
## [1] 1.459751e-40 7.144569e-40 2.801856e-39 3.411580e-39 4.370757e-39
## [6] 8.615696e-39
## [1] "Number of significant genes: 3301"
downreg.plot <- enrichment(res.df.human, 0.05, 1, "down", save = FALSE)
## [1] "Original p.adjust values:"
## [1] 1.938446e-33 8.969342e-32 4.342786e-28 5.029298e-24 3.873654e-23
## [6] 7.331487e-23
## [1] "Number of significant genes: 6030"
# Remove human reads
df_no_human <- rpmdata[, !colnames(rpmdata) %in% "Homo sapiens"]
total_rpm <- rowSums(df_no_human)
# Normalize dataframe back to RPM
df_renormalized <- as.data.frame(sweep(df_no_human, 1, total_rpm, "/") * 1e6)
df_renormalized[is.na(df_renormalized)] <- 0
df_renormalized <- df_renormalized[patients.over.thres , ]
# Seven common UTI-associated pathogens
seven.pathogens <- c("Escherichia coli", "Klebsiella pneumoniae", "Enterococcus faecalis", "Proteus mirabilis",
"Pseudomonas aeruginosa", "Staphylococcus saprophyticus", "Streptococcus agalactiae")
# Subset to only include reads for these seven pathogens
df_subset <- df_renormalized[ , colnames(df_renormalized) %in% seven.pathogens]
rpm_sums <- rowSums(df_subset, na.rm = TRUE)
rpm_sums <- rpm_sums[patients.over.thres]
# Logs the final summation of normalized pathogen abundance
rpm_sums <- sapply(rpm_sums, function(x) max(0, log10(x)))
# Run jenks classification algorithm to find optimal threshold
pathogen_abundance = rpm_sums
breaks <- classIntervals(pathogen_abundance, n=2, style="jenks")
threshold <- max(breaks$brks[-length(breaks$brks)])
print(paste0("Threshold = ", threshold))
## [1] "Threshold = 4.0107193789017"
# View dist of these abundance scores
hist(rpm_sums, breaks = 12, main = title("Logged RPM, Human Reads Excluded"))
abline(v = threshold, col = "red", lty = 2, lwd = 2)
# View pathogen abundance scores across random y-axis
set.seed(1234)
# Create a data frame with random y-values
abundance.df <- data.frame(
x = rpm_sums,
y = runif(length(rpm_sums), min = 0, max = 1),
colour = ifelse(rpm_sums > threshold, "Positive", "Negative")
)
rpm.jitter <- ggplot(abundance.df, aes(x = x, y = y, color = colour)) +
geom_jitter(height = 0, width = 0.1, alpha = 0.6, size = 2.5) +
geom_vline(xintercept = threshold, color = "black", linetype = "dashed", linewidth = 1) +
labs(
title = "RPM Sum Distribution",
x = expression(log[10] ~ "(Pathogen Abundance Score, RPM)"),
y = "Random Y-Value",
color = "Cluster"
) +
scale_color_manual(values = c("Positive" = "red", "Negative" = "blue")) +
theme_classic(base_size = 14) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
axis.title = element_text(),
legend.position = "right",
panel.border = element_rect(color = "black", fill = NA, linewidth = 1)
)
print(rpm.jitter)
heatmap.df <- t(df_renormalized[patients.over.thres, seven.pathogens])
heatmap.df <- apply(heatmap.df, c(1, 2), function(x) max(log10(x), 0))
rownames(heatmap.df) <- sapply(rownames(heatmap.df), function(x) gsub(" ", "\n", x))
# Convert the heatmap.df matrix to a long-format data frame
bubble_data <- melt(heatmap.df)
colnames(bubble_data) <- c("Row", "Column", "Value")
# Get the E coli row values
e_coli_values <- heatmap.df["Escherichia\ncoli", ]
# Reorder the columns based on the E coli values (descending order)
bubble_data$Column <- factor(bubble_data$Column, levels = names(sort(e_coli_values, decreasing = TRUE)))
bubble_plot <- ggplot(bubble_data, aes(x = Column, y = Row, size = Value, fill = Value)) +
geom_point(shape = 21, color = "black") +
scale_size_continuous(range = c(1, 10)) +
scale_fill_gradient(low = "white", high = "red") +
labs(
x = "214 Patients",
y = "Pathogen",
size = "log10(RPM)",
fill = "log10(RPM)"
) +
theme_minimal() +
theme(
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.y = element_text(size = 12, face = "italic"),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
panel.border = element_rect(color = "black", fill = NA, linewidth = 1)
)
print(bubble_plot)
log.reg.df <- data.frame(df_subset)
# Ecoli
log.reg.df$ecoli <- ifelse(reads.meta.counts$ecoli == 1, 1, 0)
log.reg.df$ecoli[is.na(log.reg.df$ecoli)] <- 0
ecoli <- glm(ecoli ~ Escherichia.coli, data = log.reg.df, family = binomial)
ecoli_probs <- predict(ecoli, type = "response")
auc_ecoli <- roc(log.reg.df$ecoli, ecoli_probs)
print(auc_ecoli)
##
## Call:
## roc.default(response = log.reg.df$ecoli, predictor = ecoli_probs)
##
## Data: ecoli_probs in 106 controls (log.reg.df$ecoli 0) < 108 cases (log.reg.df$ecoli 1).
## Area under the curve: 0.9895
plot(auc_ecoli, col = "blue", main = "ROC Curve for E. coli")
# Kleb
log.reg.df$kleb <- ifelse(reads.meta.counts$klebsiella == 1, 1, 0)
log.reg.df$kleb[is.na(log.reg.df$kleb)] <- 0
kleb <- glm(kleb ~ Klebsiella.pneumoniae, data = log.reg.df, family = binomial)
kleb_probs <- predict(kleb, type = "response")
auc_kleb <- roc(log.reg.df$kleb, kleb_probs)
print(auc_kleb)
##
## Call:
## roc.default(response = log.reg.df$kleb, predictor = kleb_probs)
##
## Data: kleb_probs in 211 controls (log.reg.df$kleb 0) < 3 cases (log.reg.df$kleb 1).
## Area under the curve: 0.7196
plot(auc_kleb, col = "blue", main = "ROC Curve for K. pneumoniae")
# EC
log.reg.df$ec <- ifelse(reads.meta.counts$enterococcus == 1, 1, 0)
log.reg.df$ec[is.na(log.reg.df$ec)] <- 0
ec <- glm(ec ~ Enterococcus.faecalis, data = log.reg.df, family = binomial)
ec_probs <- predict(ec, type = "response")
auc_ec <- roc(log.reg.df$ec, ec_probs)
print(auc_ec)
##
## Call:
## roc.default(response = log.reg.df$ec, predictor = ec_probs)
##
## Data: ec_probs in 209 controls (log.reg.df$ec 0) < 5 cases (log.reg.df$ec 1).
## Area under the curve: 0.9943
plot(auc_ec, col = "blue", main = "ROC Curve for E. faecalis")
# Proteus
log.reg.df$prot <- ifelse(reads.meta.counts$proteus == 1, 1, 0)
log.reg.df$prot[is.na(log.reg.df$prot)] <- 0
prot <- glm(prot ~ Proteus.mirabilis, data = log.reg.df, family = binomial)
prot_probs <- predict(prot, type = "response")
auc_prot <- roc(log.reg.df$prot, prot_probs)
print(auc_prot)
##
## Call:
## roc.default(response = log.reg.df$prot, predictor = prot_probs)
##
## Data: prot_probs in 209 controls (log.reg.df$prot 0) < 5 cases (log.reg.df$prot 1).
## Area under the curve: 1
plot(auc_prot, col = "blue", main = "ROC Curve for P. mirabilis")
# P. aeruginosa
log.reg.df$pa <- ifelse(reads.meta.counts$mainpathogen == "Pseudomonas Aeruginosa", 1, 0)
log.reg.df$pa[is.na(log.reg.df$pa)] <- 0
pa <- glm(pa ~ Pseudomonas.aeruginosa, data = log.reg.df, family = binomial)
pa_probs <- predict(pa, type = "response")
auc_pa <- roc(log.reg.df$pa, pa_probs)
print(auc_pa)
##
## Call:
## roc.default(response = log.reg.df$pa, predictor = pa_probs)
##
## Data: pa_probs in 213 controls (log.reg.df$pa 0) < 1 cases (log.reg.df$pa 1).
## Area under the curve: 1
plot(auc_pa, col = "blue", main = "ROC Curve for P. aeruginosa")
plot_filtered_rpm_heatmap <- function(rpm_df, abundance_threshold = 1, log_transform = TRUE, cluster_rows = TRUE, cluster_columns = TRUE, top_n_species = NULL) {
# Check input
if (!is.data.frame(rpm_df)) stop("Input must be a data frame.")
# Remove any non-numeric columns (e.g., metadata)
rpm_mat <- rpm_df %>% dplyr::select(where(is.numeric)) %>% as.matrix()
rownames(rpm_mat) <- rownames(rpm_df)
# Filter species (columns) based on abundance threshold in any sample
# Threshold is in percent — so convert to raw RPM
filtered_mat <- rpm_mat[ , apply(rpm_mat, 2, function(col) any(col > abundance_threshold)), drop = FALSE]
# Optional: keep only top N most variable species
if (!is.null(top_n_species)) {
species_var <- apply(filtered_mat, 2, var)
top_species <- names(sort(species_var, decreasing = TRUE))[1:min(top_n_species, length(species_var))]
filtered_mat <- filtered_mat[, top_species, drop = FALSE]
}
# Log-transform to reduce skew (optional)
if (log_transform) {
filtered_mat <- log10(filtered_mat + 1) # add 1 to avoid log(0)
}
# Heatmap color scale
col_fun <- colorRamp2(c(min(filtered_mat), max(filtered_mat)), c("white", "red"))
# Plot heatmap
Heatmap(filtered_mat,
name = "RPM",
col = col_fun,
show_row_names = FALSE,
show_column_names = TRUE,
cluster_rows = cluster_rows,
cluster_columns = cluster_columns,
row_title = "214 Patients",
column_title = "Bracken Output Including Human",
column_names_gp = grid::gpar(fontsize = 4),
heatmap_legend_param = list(title = ifelse(log_transform, "log10(RPM+1)", "RPM")))
}
without_human <- plot_filtered_rpm_heatmap(data.frame(df_renormalized), abundance_threshold = 10000)
with_human <- plot_filtered_rpm_heatmap(data.frame(rpmdata), abundance_threshold = 500)
print(with_human)
print(without_human)
# Create a patient order based on k-means group: positive (1) then negative (0)
ordered_k_groups <- k_groups[order(k_groups, decreasing = TRUE)]
ordered_k_groups_name <- names(ordered_k_groups)
ordered_k_groups <- sapply(ordered_k_groups, function(x) ifelse(x == 0, 2, 1))
# Find up and down genes
sig.genes.up <- rownames(res.df.human[res.df.human$log2FoldChange > 2 & res.df.human$padj < 1e-25 , ])
sig.genes.up <- sig.genes.up[!is.na(sig.genes.up) & !grepl("^NA", sig.genes.up)]
sig.genes.down <- rownames(res.df.human[res.df.human$log2FoldChange < -2 & res.df.human$padj < 1e-25 , ])
sig.genes.down <- sig.genes.down[!is.na(sig.genes.down) & !grepl("^NA", sig.genes.down)]
# Bind these genes together
sig.genes.heatmap <- c(sig.genes.up, sig.genes.down)
sig.genes.heatmap <- sig.genes.heatmap[!is.na(sig.genes.heatmap) & !grepl("^NA", sig.genes.heatmap)]
# Format the z-score-based heatmap fill
temp = t(scale(t(assay(vst.for.heatmap[,]))))
temp[temp < -4] = -4
temp[temp > +4] = +4
temp[is.na(temp)] = 0
temp <- temp[sig.genes.heatmap , ]
# Add first col annotation- the k-means cluster
col_anno <- HeatmapAnnotation(k_means_group = ordered_k_groups,
col = list(ordered_k_groups = c("2" = "blue", "1" = "red")))
# Add second col annotation, unlogged pathogen abundance score bar plot
rpm_sums_to_use <- rpm_sums[ordered_k_groups_name]
col_anno <- HeatmapAnnotation(
`Unlogged Pathogen\nAbundance Score` = anno_barplot(10^rpm_sums_to_use, gp = gpar(fill = "gray")),
`k-means cluster` = ordered_k_groups,
col = list(`k-means cluster` = c("2" = "blue", "1" = "red")),
annotation_name_gp = gpar(fontsize = 10, fontface = "bold"),
annotation_name_rot = 0
)
# Side annotation
# Do this for binary now to help for the row annotation
binary.up <- rep("1", length(sig.genes.up))
binary.down <- rep("2", length(sig.genes.down))
binary.combined <- c(binary.up, binary.down)
names(binary.combined) <- sig.genes.heatmap
row_anno <- rowAnnotation(
`Gene Expression` = binary.combined,
col = list(`Gene Expression` = c("2" = "blue", "1" = "red")),
annotation_name_gp = gpar(fontsize = 10, fontface = "bold"),
annotation_name_rot = 0,
annotation_name_offset = unit(2, "mm")
)
# Generate heatmap without x-axis, y-axis labels, and annotation labels
gene.heatmap <- Heatmap(
temp[ , ordered_k_groups_name],
show_column_names = FALSE,
show_row_names = FALSE,
column_order = ordered_k_groups_name,
top_annotation = col_anno,
right_annotation = row_anno,
heatmap_legend_param = list(
title = "Z-score\ngene expression",
labels_gp = gpar(fontsize = 10),
title_gp = gpar(fontsize = 10, fontface = "bold"),
position = "right",
legend_gp = gpar(fontsize = 28)
))
gene.heatmap
# Create a data frame for plotting
plot_data <- data.frame(
k_means_group = factor(ordered_k_groups, levels = unique(ordered_k_groups)),
rpm_sums = rpm_sums_to_use
)
# Assign colors based on k-means group position (left = red, right = blue)
plot_data$point_color <- ifelse(as.numeric(plot_data$k_means_group) <= length(unique(ordered_k_groups)) / 2, "red", "blue")
plot_data$k_means_group <- factor(plot_data$k_means_group, levels = c("1", "2"), labels = c("Group 1", "Group 2"))
# Perform a stat test to confirm what p val is for both samples:
shapiro_results <- by(plot_data$rpm_sums, plot_data$k_means_group, shapiro.test)
print(shapiro_results)
## plot_data$k_means_group: Group 1
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.51495, p-value < 2.2e-16
##
## ------------------------------------------------------------
## plot_data$k_means_group: Group 2
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.92623, p-value = 1.294e-05
# Since not normal, follow wilcox test:
wilcox_result <- wilcox.test(rpm_sums ~ k_means_group, data = plot_data)
print(wilcox_result$p.value)
## [1] 1.492372e-29
# Effect size calculation using Cliff's Delta
cliffs_delta <- cliff.delta(plot_data$rpm_sums[plot_data$k_means_group == "Group 1"],
plot_data$rpm_sums[plot_data$k_means_group == "Group 2"])
print(cliffs_delta)
##
## Cliff's Delta
##
## delta estimate: 0.8935315 (large)
## 95 percent confidence interval:
## lower upper
## 0.8091849 0.9417936
violin <- ggplot(plot_data, aes(x = k_means_group, y = 10^rpm_sums, fill = k_means_group)) +
geom_violin(alpha = 0.5, scale = "width") + # Violin plot
geom_boxplot(width = 0.2, outlier.shape = NA, color = "black", alpha = 0.7) +
geom_jitter(aes(color = point_color), width = 0.2, size = 2, alpha = 0.7) +
scale_fill_manual(values = c("red", "blue")) +
scale_color_identity() +
theme_minimal() +
theme(
legend.position = "none",
panel.border = element_rect(color = "black", fill = NA, linewidth = 1)
) +
labs(x = "K-means Group", y = "Pathogen Abundance (RPM)", title = "Violin Plot of RPM Sums by K-means Group")
violin
par(pty = "s")
cluster <- ifelse(kmeans_result_pca$cluster == 2, 0, 1)
regression_1 <- glm(cluster ~ rpm_sums, data = data.frame(cluster = cluster, rpm_sums), family = binomial)
regression_1_probs <- predict(regression_1, type = "response")
auc_regression_1 <- roc(cluster, regression_1_probs)
print(auc_regression_1)
##
## Call:
## roc.default(response = cluster, predictor = regression_1_probs)
##
## Data: regression_1_probs in 110 controls (cluster 0) < 104 cases (cluster 1).
## Area under the curve: 0.9468
plot(auc_regression_1, col = "blue", main = "Pathogen Abundance Score Predicts RNA-Seq Host Response", asp = 1, lwd = 3)
generate.scatter <- function(gene, ensemblID, tpm, rpm_sums, threshold, save = FALSE) {
# Get TPM values for selected gene
score <- tpm[ensemblID, ]
# Create data frame
scatter.df <- data.frame(rpm_sums, score)
cor_test <- cor.test(scatter.df$rpm_sums, scatter.df$score)
# Create scatter plot
scatter <- ggplot(scatter.df, aes(x = rpm_sums, y = score)) +
geom_smooth(method = "lm", se = FALSE, color = "red") +
geom_point(size = 3) +
labs(
title = paste(gene, "Expression vs. UTI Pathogen Abundance"),
x = "log10(Pathogen RPM Score)",
y = paste(gene, "TPM")
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"), # Center and bold title
text = element_text(size = 14), # Increase overall text size
panel.border = element_rect(color = "black", fill = NA, size = 1.5) # Add border
)
# Create a grouping variable for threshold
scatter.df$group <- ifelse(scatter.df$rpm_sums >= threshold, "Positive", "Negative")
# Create the jitter + boxplot with t-test annotation
jitter_boxplot <- ggplot(scatter.df, aes(x = group, y = score, color = group, fill = group)) +
geom_boxplot(outlier.shape = NA, alpha = 0.4, color = "black") + # Boxplot without outliers
geom_jitter(width = 0.2, size = 3, alpha = 0.7) +
scale_color_manual(values = c("Negative" = "blue", "Positive" = "red")) +
scale_fill_manual(values = c("Negative" = "blue", "Positive" = "red")) +
labs(
title = paste(gene, "Expression by Pathogen Abundance Group"),
x = "Pathogen Group",
y = paste(gene, "TPM")
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
text = element_text(size = 14),
panel.border = element_rect(color = "black", fill = NA, size = 1.5)
) +
stat_compare_means(method = "t.test", label = "p.format",
label.y = max(scatter.df$score) * 1.05)
# Save plots if needed
if (save) {
output_file = paste0(gene, ".pdf")
pdf(output_file, width = 8, height = 6)
print(scatter)
print(jitter_boxplot)
dev.off()
}
# Return plots as a list
return(list(scatter = scatter, jitter = jitter_boxplot))
}
# Use sample genes
NFKB2 <- generate.scatter("NFKB2", "ENSG00000077150", txi.data$abundance, rpm_sums, threshold = threshold, save = FALSE)
JAK3 <- generate.scatter("JAK3", "ENSG00000105639", txi.data$abundance, rpm_sums, threshold = threshold, save = FALSE)
LIMK2 <- generate.scatter("LIMK2", "ENSG00000182541", txi.data$abundance, rpm_sums, threshold = threshold, save = FALSE)
CSF3R <- generate.scatter("CSF3R", "ENSG00000119535", txi.data$abundance, rpm_sums, threshold = threshold, save = FALSE)
IL8 <- generate.scatter("IL8", "ENSG00000169429", txi.data$abundance, rpm_sums, threshold = threshold, save = FALSE)
TNF <- generate.scatter("TNF", "ENSG00000232810", txi.data$abundance, rpm_sums, threshold = threshold, save = FALSE)
HLAA <- generate.scatter("HLAA", "ENSG00000206503", txi.data$abundance, rpm_sums, threshold = threshold, save = FALSE)
# Print them
print(NFKB2$jitter)
print(JAK3$jitter)
print(LIMK2$jitter)
print(CSF3R$jitter)
print(IL8$jitter)
print(TNF$jitter)
print(HLAA$jitter)
# First, create the df containing all four binary classifications (venn.df)
# 1 -> RNA-seq pathogen abundance
rpm_sums_binary <- ifelse(rpm_sums > threshold, 1, 0)
# 2 -> Clinical host response (leukocyte esterase)
enrleuks <- reads.meta.counts$enrleuks
binary.enrleuks <- ifelse(enrleuks == 1 | is.na(enrleuks), 0, 1)
# 3 -> RNA-seq host response (k-means clusters, already computed above in 'cluster' object)
# 4 -> Clinical pathogen abundance
clin <- reads.meta.counts$uti
venn.df <- data.frame(patients.over.thres,
cluster,
rpm_sums_binary,
binary.enrleuks,
clin)
# remove patient 20070, because they do not have leukocyte esterase activity data (n = 213 now)
venn.df <- venn.df %>% filter(patients.over.thres != 20070)
# This will only be used for the ROC analysis predicting leukocyte esterase
rpm_sums_temp <- rpm_sums[names(rpm_sums) != 20070]
# Add an all-zero row to count patients with no group memberships
venn.df <- venn.df[ , c(2, 3, 4, 5)]
colnames(venn.df) <- c("RNA-Seq Host Response Cluster", "RNA-Seq Pathogen Abundance",
"Clinical Host Response Score", "Clinical Pathogen Abundance")
# Generate Euler diagram
new.venn <- euler(venn.df)
# Plot four overlapping categories
plot(new.venn,
quantities = TRUE,
fills = c("cornflowerblue", "lightcoral", "goldenrod1", "mediumseagreen"),
alpha = 0.6,
labels = list(font = 2, cex = 1.2),
edges = list(col = "black", lwd = 2),
legend = TRUE)
rnaseq_venn <- euler(venn.df[ , c(1,2)])
par(mar = c(5, 5, 5, 5))
# Plot RNA-seq host response versus pathogen abundance
plot(rnaseq_venn,
quantities = list(cex = 1.5),
fills = c("red", "blue"),
alpha = 0.6,
labels = FALSE,
edges = list(col = "black", lwd = 2),
legend = TRUE,
quantities.cex = 10)
# Plot clinical host response versus pathogen abundance
clinical_venn <- euler(venn.df[ , c(3,4)])
plot(clinical_venn,
quantities = list(cex = 1.5),
fills = c("green", "purple"),
alpha = 0.6,
edges = list(col = "black", lwd = 2),
legend = TRUE,
quantities.cex = 10)
# Create a list of sets for all positive diagnoses across groups
sets <- list(
"RNA-Seq Host\nResponse Score" = which(venn.df[ , 1] == 1),
"RNA-Seq\nTaxonomic Score" = which(venn.df[ , 2] == 1),
"Clinical Host Response\n(Leukocyte Esterase)" = which(venn.df[ , 3] == 1),
"Clinical Taxonomic\nScore (Culture)" = which(venn.df[ , 4] == 1)
)
# Plot the Venn diagram
venn_plot <- ggvenn(sets,
fill_color = c("red", "blue", "green", "purple"),
text_size = 5,
set_name_size = 5)
# Should add a count for all negative patients
all_zero_count <- sum(rowSums(venn.df) == 0)
# Add annotation to the plot
venn_plot +
annotate(
"text",
x = 0.5, y = -2,
label = paste("All zero count:", all_zero_count),
size = 5,
hjust = 0.5
)
# 1) -> Just RNA-Seq
# Create a list of sets
sets1 <- list(
"RNA-Seq HR" = which(venn.df[ , 1] == 1),
"RNA-Seq Tax" = which(venn.df[ , 2] == 1))
# Plot the Venn diagram
venn_plot1 <- ggvenn(sets1,
fill_color = c("red", "blue"),
text_size = 4,
set_name_size = 4)
venn_plot1
# 2) -> Just Clinical
# Create a list of sets
sets2 <- list(
"Clinical HR" = which(venn.df[ , 3] == 1),
"Clinical Tax" = which(venn.df[ , 4] == 1))
# Plot the Venn diagram
venn_plot2 <- ggvenn(sets2,
fill_color = c("green", "purple"),
text_size = 4,
set_name_size = 4)
venn_plot2
rnaseq_positives <- rownames(venn.df[venn.df$`RNA-Seq Host Response Cluster` == venn.df$`RNA-Seq Pathogen Abundance` & venn.df$`RNA-Seq Host Response Cluster` == 1, ])
rnaseq_negatives <- rownames(venn.df[venn.df$`RNA-Seq Host Response Cluster` == venn.df$`RNA-Seq Pathogen Abundance` & venn.df$`RNA-Seq Host Response Cluster` == 0, ])
clinical_positives <- rownames(venn.df[venn.df$`Clinical Host Response Score` == venn.df$`Clinical Pathogen Abundance` & venn.df$`Clinical Host Response Score` == 1, ])
clinical_negatives <- rownames(venn.df[venn.df$`Clinical Host Response Score` == venn.df$`Clinical Pathogen Abundance` & venn.df$`Clinical Host Response Score` == 0, ])
all_patients <- unique(c(clinical_negatives, clinical_positives, rnaseq_positives, rnaseq_negatives))
patient_df <- data.frame(
patient_id = all_patients,
clinical = ifelse(all_patients %in% clinical_positives, "clinical positive", "clinical negative"),
rnaseq = ifelse(all_patients %in% rnaseq_positives, "RNA-seq positive", "RNA-seq negative")
)
table(patient_df$clinical, patient_df$rnaseq)
##
## RNA-seq negative RNA-seq positive
## clinical negative 96 1
## clinical positive 10 99
venn_sets <- list(
Clinical_Pos = clinical_positives,
Clinical_Neg = clinical_negatives,
RNAseq_Pos = rnaseq_positives,
RNAseq_Neg = rnaseq_negatives
)
fit <- euler(venn_sets)
plot(fit, fills = c("#1b9e77", "#d95f02", "#7570b3", "#e7298a"), labels = TRUE, quantities = TRUE)
# Predicting clinical host response
par(pty = "s")
regression_2 <- glm(venn.df$`Clinical Host Response Score` ~ rpm_sums_temp, data = data.frame(`Clinical Host Response Score` = venn.df$`Clinical Host Response Score`, rpm_sums_temp), family = binomial)
regression_2_probs <- predict(regression_2, type = "response")
auc_regression_2 <- roc(venn.df$`Clinical Host Response Score`, regression_2_probs)
print(auc_regression_2)
##
## Call:
## roc.default(response = venn.df$`Clinical Host Response Score`, predictor = regression_2_probs)
##
## Data: regression_2_probs in 99 controls (venn.df$`Clinical Host Response Score` 0) < 114 cases (venn.df$`Clinical Host Response Score` 1).
## Area under the curve: 0.9527
plot(auc_regression_2, col = "blue", main = "Pathogen Abundance Score Predicts Clinical Host Response", lwd = 3)
# Clinical taxonomy
regression_3 <- glm(venn.df$`Clinical Pathogen Abundance` ~ rpm_sums_temp, data = data.frame(`Clinical Pathogen Abundance` = venn.df$`Clinical Pathogen Abundance`, rpm_sums_temp), family = binomial)
regression_3_probs <- predict(regression_3, type = "response")
auc_regression_3 <- roc(venn.df$`Clinical Pathogen Abundance`, regression_3_probs)
print(auc_regression_3)
##
## Call:
## roc.default(response = venn.df$`Clinical Pathogen Abundance`, predictor = regression_3_probs)
##
## Data: regression_3_probs in 96 controls (venn.df$`Clinical Pathogen Abundance` 0) < 117 cases (venn.df$`Clinical Pathogen Abundance` 1).
## Area under the curve: 0.9878
plot(auc_regression_3, col = "blue", main = "Pathogen Abundance Score Predicts Clinical Pathogen", lwd = 3)
#' Goal: Find centroid gene in k-means cluster 1 that is closest to the average expression
#' profile of all genes
cluster_1_patients <- rownames(venn.df[venn.df$`RNA-Seq Host Response Cluster` == 1 , ])
cluster_1_TPM <- txi.data$abundance[ , cluster_1_patients]
centroid <- colMeans(cluster_1_TPM)
distances <- apply(cluster_1_TPM, 1, function(gene_expr) {
sqrt(sum((gene_expr - centroid)^2))
})
# 3. Find the gene with the smallest distance to the centroid
closest_gene <- names(which.min(distances))
# 4. Extract the expression profile of that gene (optional)
closest_gene_profile <- txi.data$abundance["ENSG00000119535", ]
closest_gene_profile <- closest_gene_profile[order(names(closest_gene_profile))]
closest_gene_profile <- closest_gene_profile[names(closest_gene_profile) != "20070"]
length(closest_gene_profile)
## [1] 213
# Predicting clinical host response
par(pty = "s")
regression_4 <- glm(venn.df$`Clinical Host Response Score` ~ closest_gene_profile, data = data.frame(`Clinical Host Response Score` = venn.df$`Clinical Host Response Score`, closest_gene_profile), family = binomial)
regression_4_probs <- predict(regression_4, type = "response")
auc_regression_4 <- roc(venn.df$`Clinical Host Response Score`, regression_4_probs)
print(auc_regression_4)
##
## Call:
## roc.default(response = venn.df$`Clinical Host Response Score`, predictor = regression_4_probs)
##
## Data: regression_4_probs in 99 controls (venn.df$`Clinical Host Response Score` 0) < 114 cases (venn.df$`Clinical Host Response Score` 1).
## Area under the curve: 0.9895
plot(auc_regression_4, col = "blue", main = "CSF3R Expression Predicts Clinical Host Response", lwd = 3)
# Predicting clinical taxonomy
regression_5 <- glm(venn.df$`Clinical Pathogen Abundance` ~ closest_gene_profile, data = data.frame(`Clinical Pathogen Abundance` = venn.df$`Clinical Pathogen Abundance`, closest_gene_profile), family = binomial)
regression_5_probs <- predict(regression_5, type = "response")
auc_regression_5 <- roc(venn.df$`Clinical Pathogen Abundance`, regression_5_probs)
print(auc_regression_5)
##
## Call:
## roc.default(response = venn.df$`Clinical Pathogen Abundance`, predictor = regression_5_probs)
##
## Data: regression_5_probs in 96 controls (venn.df$`Clinical Pathogen Abundance` 0) < 117 cases (venn.df$`Clinical Pathogen Abundance` 1).
## Area under the curve: 0.9674
plot(auc_regression_5, col = "blue", main = "CSF3R Expression Predicts Clinical Pathogen", lwd = 3)
# Predicting RNA-seq host response
par(pty = "s")
regression_6 <- glm(venn.df$`RNA-Seq Host Response Cluster` ~ closest_gene_profile, data = data.frame(`RNA-Seq Host Response Cluster` = venn.df$`RNA-Seq Host Response Cluster`, closest_gene_profile), family = binomial)
regression_6_probs <- predict(regression_6, type = "response")
auc_regression_6 <- roc(venn.df$`RNA-Seq Host Response Cluster`, regression_6_probs)
print(auc_regression_6)
##
## Call:
## roc.default(response = venn.df$`RNA-Seq Host Response Cluster`, predictor = regression_6_probs)
##
## Data: regression_6_probs in 109 controls (venn.df$`RNA-Seq Host Response Cluster` 0) < 104 cases (venn.df$`RNA-Seq Host Response Cluster` 1).
## Area under the curve: 0.9883
plot(auc_regression_6, col = "blue", main = "CSF3R Expression Predicts RNA-seq Host Response", lwd = 3)
# Predicting RNA-seq taxonomy
regression_7 <- glm(venn.df$`RNA-Seq Pathogen Abundance` ~ closest_gene_profile, data = data.frame(`RNA-Seq Pathogen Abundance` = venn.df$`RNA-Seq Pathogen Abundance`, closest_gene_profile), family = binomial)
regression_7_probs <- predict(regression_5, type = "response")
auc_regression_7 <- roc(venn.df$`RNA-Seq Pathogen Abundance`, regression_7_probs)
print(auc_regression_7)
##
## Call:
## roc.default(response = venn.df$`RNA-Seq Pathogen Abundance`, predictor = regression_7_probs)
##
## Data: regression_7_probs in 99 controls (venn.df$`RNA-Seq Pathogen Abundance` 0) < 114 cases (venn.df$`RNA-Seq Pathogen Abundance` 1).
## Area under the curve: 0.9708
plot(auc_regression_7, col = "blue", main = "CSF3R Expression Predicts RNA-seq Pathogen", lwd = 3)
pred.df <- venn.df
provide.grouping <- function(MT_HR, MT_Tax, Clin_HR, Clin_Tax) {
comb <- paste0(MT_HR, MT_Tax, Clin_HR, Clin_Tax)
# Define meaningful labels for each combination
group_labels <- list(
"1111" = "All_Positive",
"0000" = "All_Negative",
"1110" = "Clin_Tax_Negative",
"1101" = "Clin_HR_Negative",
"1011" = "MT_Tax_Negative",
"0111" = "MT_HR_Negative",
"1100" = "Clin_Tax_HR_Negative",
"1010" = "MT_Tax_Clin_Tax_Negative",
"1001" = "MT_Tax_Clin_HR_Negative",
"0110" = "MT_HR_Clin_Tax_Negative",
"0101" = "MT_HR_Clin_HR_Negative",
"0011" = "MT_HR_MT_Tax_Negative",
"1000" = "Only_MT_HR_Positive",
"0100" = "Only_MT_Tax_Positive",
"0010" = "Only_Clin_HR_Positive",
"0001" = "Only_Clin_Tax_Positive"
)
# Return the appropriate label or NA if not found
return(group_labels[[comb]])
}
# Apply function to all rows, giving a string value for each
pred.df$group <- mapply(provide.grouping,
pred.df[ , 1],
pred.df[ , 2],
pred.df[ , 3],
pred.df[ , 4])
# Working with df_renormalized, FYI
incorrect_patients <- rownames(pred.df[pred.df$group != "All_Negative" &
pred.df$group != "All_Positive", ])
df_renormalized_subset <- df_renormalized[incorrect_patients , ]
# Function that will generate a urobiome plot on this new df
generate.urobiome.plot.with.matrix <- function(rpm, patients, rpm.threshold, metadata) {
# These are required
require(ggplot2)
require(reshape2)
require(cowplot)
require(dplyr)
require(RColorBrewer)
# 1. Process microbiome data
rpm.subsetted <- rpm[patients, ]
sum.above.thres <- function(patient) {
current.row <- rpm.subsetted[patient, ]
patients.above <- current.row[current.row >= rpm.threshold]
sum.rpm.above <- sum(patients.above)
sum.rpm.below <- 1e6 - sum.rpm.above
patients.above <- c(patients.above, "other" = sum.rpm.below)
if (sum(patients.above) != 1e6) stop("RPM must sum to 1 million!")
patients.above <- patients.above[!names(patients.above) %in% "Homo sapiens"]
return(patients.above / sum(patients.above))
}
stacked_bar_data <- lapply(patients, sum.above.thres)
names(stacked_bar_data) <- patients
# 2. Create plotting data
plot_data <- bind_rows(lapply(seq_along(stacked_bar_data), function(i) {
data.frame(t(stacked_bar_data[[i]]), Patient = patients[i], stringsAsFactors = FALSE)
})) %>%
reshape2::melt(id.vars = "Patient", variable.name = "Taxa", value.name = "Abundance") %>%
mutate(Patient = factor(Patient, levels = patients))
# 3. Create color palette
taxa_list <- setdiff(unique(plot_data$Taxa), "other")
other_colors <- colorRampPalette(brewer.pal(12, "Set3"))(length(taxa_list))
fill_colours <- c("other" = "grey", setNames(other_colors, taxa_list))
# 4. Create main plot with its own legend
p_main <- ggplot(plot_data, aes(x = Patient, y = Abundance, fill = Taxa)) +
geom_bar(stat = "identity", width = 0.7) +
coord_flip() +
scale_fill_manual(values = fill_colours) +
labs(x = "Patient", y = "Proportion of Non-Human Reads") +
theme_minimal() +
theme(axis.text.y = element_text(size = 8))
# 5. Create test matrix with colored symbols
test_long_names <- c(
"MT_HR" = "RNA-seq\nHost\nResponse",
"MT_Tax" = "RNA-seq\nPathogen\nAbundance",
"Clin_HR" = "Clinical\nHost\nResponse",
"Clin_Tax" = "Clinical\nPathogen\nAbundance"
)
test_results <- data.frame(
Patient = factor(patients, levels = patients),
MT_HR = pred.df[patients, "RNA-Seq Host Response Cluster"],
MT_Tax = pred.df[patients, "RNA-Seq Pathogen Abundance"],
Clin_HR = pred.df[patients, "Clinical Host Response Score"],
Clin_Tax = pred.df[patients, "Clinical Pathogen Abundance"],
stringsAsFactors = FALSE
) %>%
reshape2::melt(id.vars = "Patient", variable.name = "Test", value.name = "Status")
p_matrix <- ggplot(test_results, aes(x = Test, y = Patient)) +
geom_text(aes(label = ifelse(Status == 1, "√", "X"),
color = ifelse(Status == 1, "Positive", "Negative")),
size = 4) +
scale_x_discrete(position = "top", labels = test_long_names) +
scale_color_manual(values = c("Positive" = "green4", "Negative" = "red")) +
theme_minimal() +
theme(
axis.title = element_blank(),
axis.text.y = element_blank(),
panel.grid = element_blank(),
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 9),
legend.position = "right",
legend.title = element_blank()
)
# 6. Remove legends from both plots
p_main <- p_main + theme(legend.position = "none")
p_matrix <- p_matrix + theme(legend.position = "none")
# 7. Combine plots without legends
aligned_plots <- align_plots(
p_main,
p_matrix,
align = "hv",
axis = "lr"
)
plot_grid(
aligned_plots[[1]], # Main plot
aligned_plots[[2]], # Matrix
nrow = 1,
rel_widths = c(3, 3),
align = "h",
axis = "tb"
)
}
# Generate main plot without legend
main_plot <- generate.urobiome.plot.with.matrix(
df_renormalized_subset,
incorrect_patients,
75000,
reads.meta.counts
)
print(main_plot)
## STILL NEED TO ADD THE LEGEND!!
# First ensure we keep all samples, including those with NA for mainpathogen
pca_df <- data.frame(
PC1 = pca_data$PC1,
PC2 = pca_data$PC2,
Sample = rownames(pca_data)
)
pca_df <- merge(pca_df,
reads.meta.counts[patients.over.thres, c("ptid", "mainpathogen")],
by.x = "Sample",
by.y = "ptid",
all.x = TRUE)
# Create a new column that handles NA values
pca_df$pathogen_group <- ifelse(is.na(pca_df$mainpathogen),
"NA",
as.character(pca_df$mainpathogen))
# Define custom color palette for pathogens (7 colors + grey for NA)
pathogen_colors <- c(
"E. coli" = "#1F77B4", # Blue
"E. coli / Proteus" = "#FF7F0E", # Orange
"Enterococcus" = "#2CA02C", # Green
"Klebsiella" = "#D62728", # Red
"Proteus" = "#9467BD", # Purple
"Pseudomonas Aeruginosa" = "#8C564B", # Brown
"NA" = "#a3a3a3" # Grey for missing
)
# Create the plot
pca_plot <- ggplot(pca_df, aes(x = PC1, y = PC2)) +
# Add cluster ellipses first (so points appear on top)
stat_ellipse(
aes(group = cluster, fill = factor(cluster)),
geom = "polygon",
alpha = 0.1,
color = NA,
show.legend = TRUE,
level = 0.95
) +
# Add points colored by pathogen
geom_point(
aes(color = pathogen_group),
size = 3,
alpha = 0.8
) +
# Custom color scales
scale_color_manual(
values = pathogen_colors,
na.value = "grey70",
breaks = names(pathogen_colors)[1:7]
) +
scale_fill_manual(
values = c("Cluster1" = "blue", "Cluster2" = "red"),
guide = guide_legend(override.aes = list(alpha = 0.3))
) +
# Labels and theme
labs(
title = "PCA by Pathogen",
x = "PC1",
y = "PC2",
color = "Main Pathogen",
fill = "Cluster"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 18),
axis.title = element_text(face = "bold", size = 16),
axis.text = element_text(size = 12),
legend.position = "right",
legend.title = element_text(face = "bold", size = 14),
legend.text = element_text(size = 12),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(color = "black", fill = NA, size = 1)
)
pca_plot
### Now create the same plot with 3D ###
# Perform k-means clustering using 3 PCs
kmeans_result_pca_3d <- kmeans(pca_df_3d[, c("PC1", "PC2", "PC3")], centers = 2, nstart = 25)
# Add cluster assignments to your data frame
pca_df_3d$mainpathogen <- ifelse(is.na(pca_df$mainpathogen),
"NA",
as.character(pca_df$mainpathogen))
# 3D PCA plot with k-means clusters
k_means_plot_3d <- plot_ly(
data = pca_df_3d,
x = ~PC1,
y = ~PC2,
z = ~PC3,
color = ~mainpathogen,
colors = pathogen_colors,
type = "scatter3d",
mode = "markers",
marker = list(size = 5, opacity = 0.8)
) %>%
layout(
title = list(text = "3D PCA Plot with clinical main pathogen", x = 0.5),
scene = list(
xaxis = list(title = "PC1"),
yaxis = list(title = "PC2"),
zaxis = list(title = "PC3")
)
)
# Display the plot
k_means_plot_3d
set.seed(1234)
pca_df_4f <- pca_df
pca_df_4f <- pca_df_4f[pca_df_4f$Sample %in% rownames(pred.df) , ]
if (all(pca_df_4f$Sample == rownames(pred.df))) {
pca_df_4f$group <- pred.df$group
}
new_cluster <- cluster[-(which(pca_df$Sample == "20070"))]
pca_df_4f$cluster <- kmeans_result_pca$new_cluster
# Create the plot
pca_plot_2 <- ggplot(pca_df_4f, aes(x = PC1, y = PC2)) +
# Add cluster ellipses first (so points appear on top)
stat_ellipse(
aes(group = new_cluster, fill = factor(new_cluster)),
geom = "polygon",
alpha = 0.3,
color = "black", # outline to make visible
level = 0.95,
show.legend = TRUE
) +
# Add points colored by pathogen
geom_point(
aes(color = group),
size = 3,
alpha = 0.8
) +
scale_fill_manual(
values = c("Cluster1" = "blue", "Cluster2" = "red"),
guide = guide_legend(override.aes = list(alpha = 0.3))
) +
# Labels and theme
labs(
title = "PCA by venn group",
x = "PC1",
y = "PC2",
color = "Group",
fill = "Cluster"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 18),
axis.title = element_text(face = "bold", size = 16),
axis.text = element_text(size = 12),
legend.position = "right",
legend.title = element_text(face = "bold", size = 14),
legend.text = element_text(size = 12),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(color = "black", fill = NA, size = 1)
)
pca_plot_2
### Now create the same plot with 3D ###
pca_df_4f_3d <- pca_df_3d
pca_df_4f_3d <- pca_df_4f_3d[-(which(rownames(pca_df_4f_3d) == "20070")) , ]
pca_df_4f_3d$group <- pca_df_4f$group
# 3D PCA plot with k-means clusters
pca_plot_4f_3d <- plot_ly(
data = pca_df_4f_3d,
x = ~PC1,
y = ~PC2,
z = ~PC3,
color = ~group,
type = "scatter3d",
mode = "markers",
marker = list(size = 5, opacity = 0.8)
) %>%
layout(
title = list(text = "3D PCA Plot with venn diagram group colours", x = 0.5),
scene = list(
xaxis = list(title = "PC1"),
yaxis = list(title = "PC2"),
zaxis = list(title = "PC3")
)
)
# Display the plot
pca_plot_4f_3d
# First for LE
pca_df_4f$LE_discrete <- reads.meta.counts[-(which(pca_df$Sample == "20070")) , ]$enrleuks
pca_df_4f_filtered <- pca_df_4f[pca_df_4f$group %in% c("All_Positive", "MT_HR_Negative"), ]
comp_1 <- ggplot(pca_df_4f_filtered, aes(x = factor(group), y = LE_discrete, fill = factor(group))) +
geom_violin(trim = FALSE, show.legend = FALSE) + # Violin plot with no trim and without legend
geom_boxplot(width = 0.1, color = "black", fill = "white", alpha = 0.6, outlier.shape = NA) + # Boxplot inside violins
labs(x = "Discrepancy Group", y = "Absolute Leukocyte Esterase Activity", title = "Distribution of Factor Across All_Positive and MT_HR_Negative") +
scale_fill_manual(values = c("All_Positive" = "#1f77b4", "MT_HR_Negative" = "#ff7f0e")) + # Custom colors for the groups
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1)) # Add a border around the plot
wilcox.test(LE_discrete ~ group, data = pca_df_4f_filtered)
##
## Wilcoxon rank sum test with continuity correction
##
## data: LE_discrete by group
## W = 668, p-value = 0.0003453
## alternative hypothesis: true location shift is not equal to 0
print(comp_1)
# Next for pathogen abundance
pca_df_4f$rpm_sums <- rpm_sums[names(rpm_sums) != "20070"]
pca_df_4f_filtered <- pca_df_4f[pca_df_4f$group %in% c("All_Positive", "MT_HR_Negative"), ]
comp_2 <- ggplot(pca_df_4f_filtered, aes(x = factor(group), y = 10^rpm_sums, fill = factor(group))) +
geom_violin(trim = FALSE, show.legend = FALSE) + # Violin plot with no trim and without legend
geom_boxplot(width = 0.1, color = "black", fill = "white", alpha = 0.6, outlier.shape = NA) + # Boxplot inside violins
labs(x = "Discrepancy Group", y = "Pathogen RPM", title = "Distribution of Factor Across All_Positive and MT_HR_Negative") +
scale_fill_manual(values = c("All_Positive" = "purple", "MT_HR_Negative" = "darkgreen")) + # Custom colors for the groups
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1)) # Add a border around the plot
wilcox.test(10^rpm_sums ~ group, data = pca_df_4f_filtered)
##
## Wilcoxon rank sum test with continuity correction
##
## data: 10^rpm_sums by group
## W = 566, p-value = 0.04468
## alternative hypothesis: true location shift is not equal to 0
print(comp_2)
xCell_txi <- txi.data$abundance
rownames(xCell_txi) <- res.df.human$symbol
xcell_results <- xCellAnalysis(xCell_txi)
## [1] "Num. of genes: 10362"
## No annotation package name available in the input data object.
## Attempting to directly match identifiers in data to gene sets.
## Estimating ssGSEA scores for 489 gene sets.
## [1] "Calculating ranks..."
## [1] "Calculating absolute values from ranks..."
## | | | 0% | |= | 1% | |= | 2% | |== | 2% | |== | 3% | |=== | 4% | |=== | 5% | |==== | 5% | |==== | 6% | |===== | 7% | |====== | 8% | |====== | 9% | |======= | 9% | |======= | 10% | |======== | 11% | |======== | 12% | |========= | 12% | |========= | 13% | |========= | 14% | |========== | 14% | |========== | 15% | |=========== | 15% | |=========== | 16% | |============ | 17% | |============ | 18% | |============= | 18% | |============= | 19% | |============== | 20% | |============== | 21% | |=============== | 21% | |=============== | 22% | |================ | 22% | |================ | 23% | |================= | 24% | |================= | 25% | |================== | 25% | |================== | 26% | |=================== | 27% | |=================== | 28% | |==================== | 28% | |==================== | 29% | |===================== | 29% | |===================== | 30% | |====================== | 31% | |====================== | 32% | |======================= | 32% | |======================= | 33% | |======================== | 34% | |======================== | 35% | |========================= | 35% | |========================= | 36% | |========================== | 36% | |========================== | 37% | |========================== | 38% | |=========================== | 38% | |=========================== | 39% | |============================ | 40% | |============================ | 41% | |============================= | 41% | |============================= | 42% | |============================== | 43% | |=============================== | 44% | |=============================== | 45% | |================================ | 45% | |================================ | 46% | |================================= | 47% | |================================= | 48% | |================================== | 48% | |================================== | 49% | |=================================== | 50% | |==================================== | 51% | |==================================== | 52% | |===================================== | 52% | |===================================== | 53% | |====================================== | 54% | |====================================== | 55% | |======================================= | 55% | |======================================= | 56% | |======================================== | 57% | |========================================= | 58% | |========================================= | 59% | |========================================== | 59% | |========================================== | 60% | |=========================================== | 61% | |=========================================== | 62% | |============================================ | 62% | |============================================ | 63% | |============================================ | 64% | |============================================= | 64% | |============================================= | 65% | |============================================== | 65% | |============================================== | 66% | |=============================================== | 67% | |=============================================== | 68% | |================================================ | 68% | |================================================ | 69% | |================================================= | 70% | |================================================= | 71% | |================================================== | 71% | |================================================== | 72% | |=================================================== | 72% | |=================================================== | 73% | |==================================================== | 74% | |==================================================== | 75% | |===================================================== | 75% | |===================================================== | 76% | |====================================================== | 77% | |====================================================== | 78% | |======================================================= | 78% | |======================================================= | 79% | |======================================================== | 79% | |======================================================== | 80% | |========================================================= | 81% | |========================================================= | 82% | |========================================================== | 82% | |========================================================== | 83% | |=========================================================== | 84% | |=========================================================== | 85% | |============================================================ | 85% | |============================================================ | 86% | |============================================================= | 86% | |============================================================= | 87% | |============================================================= | 88% | |============================================================== | 88% | |============================================================== | 89% | |=============================================================== | 90% | |=============================================================== | 91% | |================================================================ | 91% | |================================================================ | 92% | |================================================================= | 93% | |================================================================== | 94% | |================================================================== | 95% | |=================================================================== | 95% | |=================================================================== | 96% | |==================================================================== | 97% | |==================================================================== | 98% | |===================================================================== | 98% | |===================================================================== | 99% | |======================================================================| 100%
xcell_results_scaled <- xcell_results[ , colnames(xcell_results) != "20070"]
xcell_results_scaled <- t(scale(t(xcell_results_scaled)))
# Ensure the order matches between pred.df and xcell_results_scaled
binary_annots <- pred.df[, 1:4]
binary_annots <- binary_annots[colnames(xcell_results_scaled), ] # reordering to match
# Step 1: Get ordering based on the annotation column
ordering <- order(pred.df$`RNA-Seq Host Response Cluster`, decreasing = TRUE)
# Step 2: Reorder both the data and annotations
xcell_results_scaled_ordered <- xcell_results_scaled[, ordering]
binary_annots_ordered <- pred.df[ordering, 1:4]
# Step 3: Rebuild the annotation with reordered rows
ha <- HeatmapAnnotation(
df = binary_annots_ordered,
col = list(
`RNA-Seq Host Response Cluster` = c("0" = "grey", "1" = "red"),
`RNA-Seq Pathogen Abundance` = c("0" = "grey", "1" = "blue"),
`Clinical Host Response Score` = c("0" = "grey", "1" = "darkgreen"),
`Clinical Pathogen Abundance` = c("0" = "grey", "1" = "purple")
),
show_annotation_name = TRUE,
annotation_name_side = "left"
)
# Step 4: Redraw the heatmap with sorted columns
xcell_heatmap <- Heatmap(
xcell_results_scaled_ordered,
top_annotation = ha,
col = colorRamp2(c(-2, 0, 2), c("blue", "white", "red")),
cluster_columns = FALSE,
column_names_gp = gpar(fontsize = 0), # hide patient IDs
row_names_gp = gpar(fontsize = 8), # smaller y-axis labels
name = "xCell score" # rename legend
)
# Draw the heatmap
print(xcell_heatmap)